K-MEANS clustering

Run with:
bin/spark-submit examples/src/main/python/ml/kmeans_example.py
This example requires NumPy (http://www.numpy.org/)
from __future__ import print_function

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Script entry point: fit a 2-cluster K-Means model on the sample
    # libsvm dataset, report its silhouette score and print the learned
    # cluster centers.
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # Everything that uses the session lives inside try/finally so the
    # session is stopped even when loading, fitting or evaluating fails.
    try:
        # Loads data (libsvm-formatted file at a fixed path).
        dataset = spark.read.format("libsvm").load("/opt/spark/data/mllib/sample_kmeans_data.txt")

        # Trains a K-Means model with k=2; the fixed seed keeps runs repeatable.
        kmeans = KMeans().setK(2).setSeed(1)
        model = kmeans.fit(dataset)

        # Make predictions on the same data the model was trained on.
        predictions = model.transform(dataset)

        # Evaluate clustering by computing the Silhouette score.
        evaluator = ClusteringEvaluator()

        silhouette = evaluator.evaluate(predictions)
        print("Silhouette with Squared Euclidean Distance = " + str(silhouette))

        # Shows the result: one line per cluster center.
        centers = model.clusterCenters()

        print("Cluster Centers: ")
        for center in centers:
            print(center)
    finally:
        # Stop the session inside the __main__ guard: at module level this
        # call would raise NameError whenever the file is imported, because
        # `spark` is only bound when the file runs as a script.
        spark.stop()

# Example output:
# Silhouette with Squared Euclidean Distance = 0.9997530305375207
# Cluster Centers:
# [0.1 0.1 0.1]
# [9.1 9.1 9.1]